# -*- coding: utf-8 -*-
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import gensim, re
from gensim import corpora, models 
from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity, Similarity


# Split on runs of word characters; punctuation and whitespace are discarded.
tokenizer = RegexpTokenizer(r'\w+')

# Raw string so the Windows backslashes can never be read as escape sequences.
# Judging by the file name the corpus is already split / stop-word filtered /
# stemmed, so no stemming is performed here.
CORPUS_PATH = r'D:\Implementations\Experiments\JabRef2.6\Source\CorpusRaw-AfterSplitStopStem.corpusRawMethodLevelGranularity'

# Every line of the corpus file is treated as one document and becomes one
# token list in `texts`. The `with` block guarantees the file is closed.
with open(CORPUS_PATH) as corpus_file:
    texts = [tokenizer.tokenize(line) for line in corpus_file]

# Build the id <-> term dictionary from the tokenized documents.
dictionary = corpora.Dictionary(texts)

# Convert each tokenized document into its bag-of-words vector
# (a list of (token_id, count) pairs).
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model on the bag-of-words corpus.
# Author's tuning notes (alternative settings, not applied here):
#   alpha = 1/500, eta = 1/500; in the paper's notation t_tau -> offset,
#   k -> decay, eta -> eta, K -> num_topics; offset=1024 and decay=0.9 were
#   used "for historical simulation".
#   NOTE(review): under Python 2, a literal 1/500 is integer division (== 0);
#   write 1.0/500 if these values are ever passed in.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=500,
    iterations=1000,
    passes=5
)

# Index the topic-space representation of every corpus document.
# num_features is given explicitly: otherwise gensim has to scan the
# transformed corpus to guess the vector width, which costs an extra inference
# pass and can under-size the index when the highest-numbered topics do not
# occur in any corpus document (a later query using them would then fail).
index = MatrixSimilarity(ldamodel[corpus], num_features=ldamodel.num_topics)
# Raw string so the Windows backslashes can never be read as escape sequences.
QUERIES_PATH = r'D:\Implementations\Experiments\JabRef2.6\Source\queries-AfterSplitStopStem.txt'

# Each line of the queries file is one query. For every query, print the full
# list of (document_index, similarity) pairs, most similar document first.
# The `with` block guarantees the file is closed.
with open(QUERIES_PATH) as queries_file:
    for line in queries_file:
        # Project the query into the same topic space as the indexed corpus.
        new_vec = dictionary.doc2bow(tokenizer.tokenize(line))
        doc_lda = ldamodel[new_vec]
        # Similarity of the query against every indexed document.
        sims = index[doc_lda]
        # Negated score sorts in descending order of similarity.
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        # Single-argument print(...) behaves the same as the Python 2 print
        # statement and is also valid Python 3.
        print(sims)
    
